In [1]:
import pandas as pd

# to communicate with Google BigQuery
from pandas.io import gbq
In [2]:
import plotly.plotly as py
import plotly.graph_objs as go
In [3]:
import cufflinks as cf
In [5]:
project_id = 'poised-list-163221'

This procedure creates a grouped bar chart showing prices/cost for different drugs and procedures, grouped by region

It also adds a line graph showing the volume of prescribed drugs and completed procedures for the same regions. The volume is represented on the right y-axis.

In [154]:
def createbarfig(df_max, x_max, y_max, df_min, x_min, y_min, claims, region, x_axis, y_axis, title):

    cf.set_config_file(offline = True, world_readable = True, theme = 'pearl')
    trace1 = go.Bar(
                    x =  df_max[x_max], 
                    y =  df_max[y_max],
                    name = 'max',
                    text = df_max[region],
                    xaxis = 'x',
                    yaxis = 'y', 
                    opacity = 0.6
    )
    trace2 = go.Bar(
                    x = df_min[x_min],
                    y = df_min[y_min],
                    name = 'min',
                    text = df_min[region],
                    xaxis = 'x', 
                    yaxis = 'y', 
                    opacity = 0.6
    )
    trace3 = go.Scatter(
                    x = df_min[x_min],
                    y = df_min[claims],
                    name = 'min_claims',
                    text = df_min[region], 
                    xaxis = 'x',
                    yaxis = 'y2', 
                    marker = dict(color = 'rgb(255,178,102)' )
        
        )
    
    trace4 = go.Scatter (
                    x = df_max[x_max],
                    y = df_max[claims],
                    name = 'max_claims',
                    text = df_max[region],
                    xaxis = 'x',
                    yaxis = 'y2',
                    marker = dict(color = 'rgb(102,178,255)')
    
    )
    data = [trace1, trace2, trace3, trace4]
    layout = go.Layout(
    barmode = 'group',
    margin = dict(b = 250), 
    yaxis = dict(title = y_axis, side = 'left', anchor = 'x', overlaying = 'y2'),
    yaxis2 = dict(title = 'number_claims', side = 'right', anchor = 'x'),
    xaxis = dict(title = x_axis), 
    title = title, 
    autosize = False, 
    height = 750,
    width = 750,
    legend = dict(x =-.1, y = 1.2))


    fig = go.Figure(data = data, layout=layout)

    return fig
    

This query selects the region, and total claim numbers for each drug where the unitcost is at a minimum among the top 10 prescribed list from the 2014 Medicare claims public dataset.

In [78]:
minquery = """
select * from
(
select main_table.drug, main_table.region, main_table.unitcost, main_table.total_claims from 
(select drug,  round(min(unitcost),4) as min_cost
from 
(
select region, drug, unitcost
from
(
select concat(city,' ',state) as region, drug, unitcost 
from 
(SELECT nppes_provider_city AS city, nppes_provider_state as state, drug_name as drug, 
(sum(total_drug_cost)/sum(total_day_supply)) AS unitcost
FROM [bigquery-public-data:medicare.part_d_prescriber_2014]
GROUP BY city, state, drug
) as sq1) as sq2
where drug in 
(select drug_name from (select drug_name, sum(total_claim_count) as cost
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by drug_name
order by cost desc
limit 10) as sq3) 
and region in 
(select concat(city,' ',state) as region from (select nppes_provider_city as city, 
nppes_provider_state as state, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by city, state
order by total_claims desc
limit 10) as sq4)
order by drug, unitcost
)as sq5 
group by drug)as min_table

join 

(select region, drug, round(unitcost,4) as unitcost, total_claims
from
(
select concat(city,' ',state) as region, drug, unitcost, total_claims
from 
(SELECT nppes_provider_city AS city, nppes_provider_state as state, drug_name as drug, sum(total_claim_count) as total_claims,
(sum(total_drug_cost)/sum(total_day_supply)) AS unitcost
FROM [bigquery-public-data:medicare.part_d_prescriber_2014]
GROUP BY city, state, drug
) as sq1) as sq2
where drug in 
(select drug_name from (select drug_name, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by drug_name
order by total_claims desc
limit 10) as sq3) 
and region in 
(select concat(city,' ',state) as region from (select nppes_provider_city as city, 
nppes_provider_state as state, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by city, state
order by total_claims desc
limit 10)))as main_table
on main_table.unitcost = min_table.min_cost and main_table.drug = min_table.drug
) as sqx order by main_table.drug
"""
In [79]:
drug_price_min_df = gbq.read_gbq(minquery, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 1.41 s.
Finished at 2017-05-18 19:32:03.

This query selects the region, and total claim numbers for each drug where the unitcost is at a maximum among the top 10 prescribed list from the 2014 Medicare claims public dataset.

In [97]:
maxquery = """
select * from
(
select main_table.drug, main_table.region, main_table.unitcost, main_table.total_claims from 
(select drug,  round(max(unitcost),4) as max_cost
from 
(
select region, drug, unitcost
from
(
select concat(city,' ',state) as region, drug, unitcost 
from 
(SELECT nppes_provider_city AS city, nppes_provider_state as state, drug_name as drug, 
(sum(total_drug_cost)/sum(total_day_supply)) AS unitcost
FROM [bigquery-public-data:medicare.part_d_prescriber_2014]
GROUP BY city, state, drug
) as sq1) as sq2
where drug in 
(select drug_name from (select drug_name, sum(total_claim_count) as cost
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by drug_name
order by cost desc
limit 10) as sq3) 
and region in 
(select concat(city,' ',state) as region from (select nppes_provider_city as city, 
nppes_provider_state as state, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by city, state
order by total_claims desc
limit 10) as sq4)
order by drug, unitcost
)as sq5 
group by drug)as max_table

join 

(select region, drug, round(unitcost,4) as unitcost, total_claims
from
(
select concat(city,' ',state) as region, drug, unitcost, total_claims
from 
(SELECT nppes_provider_city AS city, nppes_provider_state as state, drug_name as drug, sum(total_claim_count) as total_claims,
(sum(total_drug_cost)/sum(total_day_supply)) AS unitcost
FROM [bigquery-public-data:medicare.part_d_prescriber_2014]
GROUP BY city, state, drug
) as sq1) as sq2
where drug in 
(select drug_name from (select drug_name, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by drug_name
order by total_claims desc
limit 10) as sq3) 
and region in 
(select concat(city,' ',state) as region from (select nppes_provider_city as city, 
nppes_provider_state as state, sum(total_claim_count) as total_claims
from [bigquery-public-data:medicare.part_d_prescriber_2014]
group by city, state
order by total_claims desc
limit 10)))as main_table
on main_table.unitcost = max_table.max_cost and main_table.drug = max_table.drug
) as sqx order by main_table.drug
"""
In [98]:
drug_price_max_df = gbq.read_gbq(maxquery, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 0.89 s.
Finished at 2017-05-18 19:54:58.
In [141]:
drug_dif_fig = createbarfig(drug_price_max_df, 'main_table_drug', 'main_table_unitcost',\
                           drug_price_min_df, 'main_table_drug', 'main_table_unitcost',\
                            'main_table_total_claims', \
                           'main_table_region', 'drug name', 'cost/unit in $',\
                           'Price difference b/w frequently prescribed Medicare Rx')
In [178]:
cf.iplot(drug_dif_fig)
In [143]:
maxquery_inpatient = """

select maint.dx, maint.region, maint.total_cost, maint.discharge_num
from
(
select dx, max(total_cost) as max_cost
from 
(
select dx, region, round(total_cost,0) as total_cost
from
(
select dx, concat(city,' ',state) as region, total_cost
from
(
select drg_definition as dx, provider_city as city, provider_state as state, 
sum(average_medicare_payments) as total_cost
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx, city, state
) as sq) as sq2
where dx in (select dx from
(
select drg_definition as dx, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx
order by discharge_num desc
limit 10
)as sq3)
and region in 
(select concat(city,' ',state) as region from
(select provider_city as city, provider_state as state, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by city, state
order by discharge_num desc
limit 10
)as sq4)
order by dx, total_cost desc
) as sq5
group by dx
) as maxt
JOIN 
(select dx, region, round(total_cost,0) as total_cost, discharge_num
from
(
select dx, concat(city,' ',state) as region, total_cost, discharge_num
from
(
select drg_definition as dx, provider_city as city, provider_state as state, sum(total_discharges) as discharge_num,
sum(average_medicare_payments) as total_cost
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx, city, state
) as sq) as sq2
where dx in (select dx from
(
select drg_definition as dx, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx
order by discharge_num desc
limit 10
)as sq3)
and region in 
(select concat(city,' ',state) as region from
(select provider_city as city, provider_state as state, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by city, state
order by discharge_num desc
limit 10
)as sq4)
order by dx, total_cost desc) as maint	
on maint.total_cost = maxt.max_cost and maint.dx = maxt.dx
order by maint.dx

"""
In [144]:
inpatient_max_df = gbq.read_gbq(maxquery_inpatient, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 1.44 s.
Finished at 2017-05-18 22:55:08.
In [145]:
minquery_inpatient = """

select maint.dx, maint.region, maint.total_cost, maint.discharge_num
from
(
select dx, min(total_cost) as min_cost
from 
(
select dx, region, round(total_cost,0) as total_cost
from
(
select dx, concat(city,' ',state) as region, total_cost
from
(
select drg_definition as dx, provider_city as city, provider_state as state,  
sum(average_medicare_payments) as total_cost
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx, city, state
) as sq) as sq2
where dx in (select dx from
(
select drg_definition as dx, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx
order by discharge_num desc
limit 10
)as sq3)
and region in 
(select concat(city,' ',state) as region from
(select provider_city as city, provider_state as state, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by city, state
order by discharge_num desc
limit 10
)as sq4)
order by dx, total_cost desc
) as sq5
group by dx
) as mint
JOIN 
(select dx, region, round(total_cost,0) as total_cost, discharge_num
from
(
select dx, concat(city,' ',state) as region, total_cost, discharge_num
from
(
select drg_definition as dx, provider_city as city, provider_state as state, sum(total_discharges) as discharge_num,
sum(average_medicare_payments) as total_cost
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx, city, state
) as sq) as sq2
where dx in (select dx from
(
select drg_definition as dx, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by dx
order by discharge_num desc
limit 10
)as sq3)
and region in 
(select concat(city,' ',state) as region from
(select provider_city as city, provider_state as state, sum(total_discharges) as discharge_num
from [bigquery-public-data:medicare.inpatient_charges_2014]
group by city, state
order by discharge_num desc
limit 10
)as sq4)
order by dx, total_cost desc) as maint	
on maint.total_cost = mint.min_cost and maint.dx = mint.dx
order by maint.dx

"""
In [146]:
inpatient_min_df = gbq.read_gbq(minquery_inpatient, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 1.14 s.
Finished at 2017-05-18 22:56:51.
In [155]:
inpatient_dif_fig = createbarfig(inpatient_max_df, 'maint_dx', 'maint_total_cost',\
                           inpatient_min_df, 'maint_dx', 'maint_total_cost', 'maint_discharge_num', \
                           'maint_region', 'procedure name', 'total cost $',\
                           'Price differential among inpatient Medicare services')
In [156]:
cf.iplot(inpatient_dif_fig)
In [157]:
min_outpatient_query = """

select sq5.apc, sq5.total_payments, sq5.hospital_referral_region, sq5.num_services from 
(select apc, min(total_payments) as min_payments
from
(
select apc, hospital_referral_region, sum(average_total_payments) as total_payments, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
where hospital_referral_region in 
(select hospital_referral_region from 
(
select hospital_referral_region, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by hospital_referral_region
order by num_services desc
limit 10) as sq1 
)
and apc in 
(select apc from
(
select apc, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by apc
order by num_services desc
limit 10) as sq2
)
group by apc, hospital_referral_region
order by apc, total_payments desc
) group by apc)as sq4
join
(select apc, hospital_referral_region, sum(average_total_payments) as total_payments, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
where hospital_referral_region in 
(select hospital_referral_region from 
(
select hospital_referral_region, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by hospital_referral_region
order by num_services desc
limit 10) as sq1 
)
and apc in 
(select apc from
(
select apc, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by apc
order by num_services desc
limit 10) as sq2
)
group by apc, hospital_referral_region
order by apc, total_payments desc
)as sq5
on sq4.apc = sq5.apc and sq4.min_payments = sq5.total_payments
order by sq5.apc

"""
In [158]:
outpatient_min_df = gbq.read_gbq(min_outpatient_query, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 0.88 s.
Finished at 2017-05-18 23:03:27.
In [171]:
outpatient_min_df = outpatient_min_df.drop(outpatient_min_df.index[8])
In [159]:
max_outpatient_query = """

select sq5.apc, sq5.total_payments, sq5.hospital_referral_region, sq5.num_services from 
(select apc, hospital_referral_region, sum(average_total_payments) as total_payments, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
where hospital_referral_region in 
(select hospital_referral_region from 
(
select hospital_referral_region, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by hospital_referral_region
order by num_services desc
limit 10) as sq1 
)
and apc in 
(select apc from
(
select apc, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by apc
order by num_services desc
limit 10) as sq2
)
group by apc, hospital_referral_region
order by apc, total_payments desc
)as sq5

join 

(select apc, max(total_payments) as max_payments
from
(
select apc, hospital_referral_region, sum(average_total_payments) as total_payments, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
where hospital_referral_region in 
(select hospital_referral_region from 
(
select hospital_referral_region, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by hospital_referral_region
order by num_services desc
limit 10) as sq1 
)
and apc in 
(select apc from
(
select apc, sum(outpatient_services) as num_services
from [bigquery-public-data:medicare.outpatient_charges_2014]
group by apc
order by num_services desc
limit 10) as sq2
)
group by apc, hospital_referral_region
order by apc, total_payments desc
) group by apc)as sq6 
on sq5.apc = sq6.apc and sq5.total_payments = sq6.max_payments
order by sq5.apc
"""
In [160]:
outpatient_max_df = gbq.read_gbq(max_outpatient_query, project_id = project_id)
Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 10 rows.

Total time taken 0.97 s.
Finished at 2017-05-18 23:04:20.
In [168]:
outpatient_max_df = outpatient_max_df.drop(outpatient_max_df.index[8])
In [173]:
outpatient_dif_fig = createbarfig(outpatient_max_df, 'sq5_apc', 'sq5_total_payments',\
                           outpatient_min_df, 'sq5_apc', 'sq5_total_payments', 'sq5_num_services', \
                           'sq5_hospital_referral_region', 'procedure name', 'total cost $',\
                           'Price differential among outpatient Medicare services')
In [174]:
cf.iplot(outpatient_dif_fig)